1 Data

1.1 Data Loading

suppressPackageStartupMessages({
    library(CATALYST)
    
    library(flowCore)
    library(ggplot2)
    library(SingleCellExperiment)
    
    library(DT)
    
    library(dplyr)
    library(tidyr)
    library(broom)
    library(stringr)
    library(janitor)
    library(readxl)
    library(nlme)
    library(ggiraph)
    library(viridis)
})
## Warning: replacing previous import 'S4Arrays::read_block' by
## 'DelayedArray::read_block' when loading 'SummarizedExperiment'
data_folder <- "/enna/nobackup/alexq/nadav"

create_dt <- function(x){
  DT::datatable(x,
                extensions = 'Buttons',
                options = list(dom = 'Blfrtip',
                               buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
                               lengthMenu = list(c(10,25,50,-1),
                                                 c(10,25,50,"All"))))
}
export_figs <- FALSE

1.1.1 Extracting gating from wsp

library(CytoML)
library(flowWorkspace)

flowWsDataPath <- file.path(data_folder, "data")
wsp_files <- list.files(flowWsDataPath, pattern = ".wsp")
fcs_folders <- file.path(flowWsDataPath, 
                         c("Debarcoded fcs1", "Debarcoded FCS2", 
                           "Debarcoded FCS3", "Debarcoded FCS4",
                           "Debarcoded fcs1 2024", "Debarcoded fcs 2 2024",
                           "Debarcoded fcs 3 2024", "Debarcoded fcs 4 2024",
                           "Debarcoded fcs 5 2024"))

#' Fixing fcs with SampleID channel
#' ----------------
# library(cytoqc)
# rawfiles <- list.files(fcs_folders[9], ".fcs", recursive = TRUE, full.names = TRUE)
# cqc_data <- cqc_load_fcs(rawfiles)
# 
# check_res <- cqc_check(cqc_data, type = "channel")
# check_res
# 
# match_res <- cqc_match(check_res, ref = 2)
# match_res
# 
# for(i in 1:length(cqc_data)) {
#   if(ncol(cqc_data[[i]]) == 65){
#     print(rawfiles[i])
#     
#     fcs_file <- read.FCS(
#       file.path(rawfiles[i]),
#       column.pattern = "SampleID", # Remove sample ID as only some have it
#       invert.pattern = TRUE,
#       truncate_max_range = FALSE
#     )
#     
#     write.FCS(
#       fcs_file,
#       file.path(rawfiles[i])
#     )
#     
#   }
# }

# fcs_file <- read.FCS(
#     # "data/Debarcoded fcs1 2024/HC233/HC233_CD45_108_LJM_019_TP2_TP10_concat_1.fcs",
#     # "data/Debarcoded fcs1 2024/HC233/HC233_CD45_108_SCH_003_d30_SCH_015_d90_concat_1.fcs",
#     # "data/Debarcoded fcs1 2024/Patients/LJM_019_TP2_concat_1.fcs",
#     # "data/Debarcoded fcs1 2024/Patients/LJM_019_TP10_concat_1.fcs",
#     # "data/Debarcoded fcs1 2024/Patients/SCH_003_d30_concat_1.fcs",
#     "data/Debarcoded fcs1 2024/Patients/SCH_015_d90_concat_1.fcs",
#     "/dskh/nobackup/alexq/nadav/data/Debarcoded fcs 2 2024/Patients/YY_003_TP6_12_concat_1.fcs",
#     column.pattern = "SampleID", # Remove sample ID as only some have it
#     invert.pattern = TRUE,
#     truncate_max_range = FALSE
# )
# 
# write.FCS(
#     fcs_file,
#     # "data/Debarcoded fcs1 2024/HC233/HC233_CD45_108_SCH_003_d30_SCH_015_d90_concat_1.fcs"
#     # "data/Debarcoded fcs1 2024/Patients/LJM_019_TP2_concat_1.fcs"
#     # "data/Debarcoded fcs1 2024/Patients/LJM_019_TP10_concat_1.fcs"
#     # "data/Debarcoded fcs1 2024/Patients/SCH_003_d30_concat_1.fcs"
#     "data/Debarcoded fcs1 2024/Patients/SCH_015_d90_concat_1.fcs"
# )


#' ======================
#' Get the workspace files and label cells by gating
#' ----------------------

priority_cell_paths = c()

getGatedCells <- function(ws_file,
                          ws_sample_group_name,
                          fcs_files_loc) {
  # Parse workspace file
  ws <- CytoML::open_flowjo_xml(ws_file)
  # Get gating set
  gs <- CytoML::flowjo_to_gatingset(ws, 
                                  name = ws_sample_group_name, 
                                  path = fcs_files_loc)
  # Get list of files
  sample_list <- sampleNames(gs)
  # # Vector of fcs file sizes
  # sample_num <- as.numeric(unlist(lapply(strsplit(sample_list, "_"), 
  #                                        function(x) x[length(x)])))
  # names(sample_num) <- unlist(lapply(strsplit(sample_list, "_"), 
  #                                    function(x) paste(x[-length(x)], collapse = "_")))
  # Get annotations for each cell
  cellTypes_list_gs1 <- list()
  cellTypes_df_list_gs1 <- list()
  
  pb <- txtProgressBar(min = 0, max = length(sample_list), style = 3)
  for(s in 1:length(sample_list)) { # iterate through each sample
    leaf_nodes <- gs_get_pop_paths(gs[[s]], order="bfs")
    cellTypes_list_sample <- list()
    for (ln in leaf_nodes) { # iterate through each node
      cellTypes <- character()
      cellTypes<- gh_pop_get_indices(gs[[s]], ln)
      cellTypes_list_sample[[ln]] <- cellTypes
    }
    cellTypes_list_sample <- do.call(cbind, cellTypes_list_sample)
    # Unprioritise cell subset annotations
    idx <- which(colnames(cellTypes_list_sample) %in% c("root", "/singlets", "/first part", "/CD34+ singlets"))
    # Prioritise cell subset annotations
    idx_p <- which(colnames(cellTypes_list_sample) %in% priority_cell_paths)
    idx <- c(idx, seq_len(ncol(cellTypes_list_sample))[-c(idx,idx_p)], idx_p)
    # Pick the furthest classified annotation
    cellTypes <- apply(cellTypes_list_sample[, idx], 1, function(x) {
      res <- names(which(x))
      res[length(res)]
      # return(res)
    })
    cellTypes_list_gs1[[sample_list[s]]] <- cellTypes
    cellTypes_df_list_gs1[[sample_list[s]]] <- cellTypes_list_sample[, idx]
    
    setTxtProgressBar(pb, s)
  }
  close(pb)
  
  # A list of all the cell labels, sorted by fcs file
  names(cellTypes_list_gs1) <- unlist(lapply(strsplit(names(cellTypes_list_gs1), "_"), 
                                             function(x) paste(x[-length(x)], collapse = "_")))
  names(cellTypes_df_list_gs1) <- names(cellTypes_list_gs1)
  
  cell_type_df <- lapply(seq_along(cellTypes_list_gs1), function(i) {
    data.frame(mg_cell_path=cellTypes_list_gs1[[i]], 
               cell_index=1:length(cellTypes_list_gs1[[i]]),
               fcs_name=names(cellTypes_list_gs1)[i])
    }) %>% 
    bind_rows() %>%
    mutate(wsp_name=unlist(lapply(strsplit(ws_file, "/"), function(x) x[length(x)])))
  
  cell_type_df_full <- lapply(seq_along(cellTypes_list_gs1), function(i) {
    data.frame(mg_cell_path=cellTypes_list_gs1[[i]], 
               cell_index=1:length(cellTypes_list_gs1[[i]]),
               fcs_name=names(cellTypes_list_gs1)[i]) %>% 
      bind_cols(
          cellTypes_df_list_gs1[[i]]
      ) 
    })%>%
    bind_rows() %>%
    mutate(wsp_name=unlist(lapply(strsplit(ws_file, "/"), function(x) x[length(x)])))
  
  return(list(cell_type_df=cell_type_df,
              cell_type_df_full=cell_type_df_full,
              leaf_nodes=leaf_nodes))
}


# # Get all leaf nodes and put into excel 

mg_list_mha <- getGatedCells(
  ws_file = file.path("/dskh/nobackup/alexq/nadav/mha_data/MHA_IVoCC_002/BG_20210618_CB_NM.wsp"),
  fcs_files_loc = file.path("/dskh/nobackup/alexq/nadav/mha_data/MHA_IVoCC_002/FCS"),
  ws_sample_group_name = "All Samples"
)

wsp_files <- wsp_files[c(2,7,8,9,1,3,4,5,6)]

mg_list <- lapply(1:9, 
                  function(x) getGatedCells(
                      ws_file = file.path(flowWsDataPath, wsp_files[x]),
                      fcs_files_loc = fcs_folders[x],
                      ws_sample_group_name = "All Samples"
                      )
                  )

saveRDS(
    mg_list,
    file=file.path(data_folder, "data/cell_type_mg_list_b4.RDS")
)

saveRDS(
    mg_list_mha,
    file=file.path(data_folder, "data/cell_type_mg_list_mha.RDS")
)

1.1.2 Creating SCE

1.1.2.1 Creating SCE for 2022 Batch

#' ==================================================
#' Read in data
#' --------------------------------------------------
fcsFiles <- list.files(file.path(data_folder, "data"),
                       pattern=".fcs",
                       recursive = TRUE,
                       full.names = TRUE) %>%
    .[!grepl("Don't USE",.)]

fcsFiles <- fcsFiles[!grepl("2024", fcsFiles)]

fcs_raw <- read.flowSet(fcsFiles, 
                        transformation = FALSE, 
                        truncate_max_range = FALSE)#,
                        # column.pattern = "SampleID", # Remove sample ID as only some have it
                        # invert.pattern = TRUE)

### Get marker information ----------------------------------
# pregating_channels <- c("Bead", "DNA1", "DNA2", "Dead", "Cell_length")
lineage_channels <- c("CD11c","CD56",
                      "CD8a",#"CD8A",
                      "CD57","CD27","CD19","CD45RA",
                      "CD4","KLRG1","CD45RO","CD31",
                      "CD194_CCR4",#"CD194 (CCR4)",
                      "CD197_CCR7",#"CD197 (CCR7)",
                      "CD14","APC","CD16","IgD","Ki67",
                      "CD25","CD3","CD38",
                      "IntB7",#"integrin beta7",
                      "CD127")

# Base the panel off the batch 1 .fcs
fcs_panel <- pData(parameters(fcs_raw[[which(fcsFiles == "/dskh/nobackup/alexq/nadav/data/Debarcoded fcs1/HC229/HC299_CHW_002_d180_CD45_104.fcs")]])) %>%
  select(name, desc) %>%
    mutate(marker = gsub("^[^_]+_", "", desc)) %>%
    mutate(marker_class = ifelse(marker %in% lineage_channels,
                                "type", ifelse(is.na(marker),
                                                  "none",
                                                  "state")))

samp_df <- data.frame(file_name_full=list.files(file.path(data_folder, "data"),
                       pattern=".fcs",
                       recursive = TRUE,
                       full.names = FALSE)) %>%
    filter(!grepl("Don't USE",file_name_full)) %>%
    # mutate(batch = strsplit(file_name_full, split = "/")[[1]][1]) %>%
    tidyr::separate_wider_delim(file_name_full, delim = "/", names = c("batch", "sample_type", "file_name")) %>%
    rowwise() %>% 
    mutate(pat_id = ifelse(sample_type == "HC229" & batch == "Debarcoded fcs1",
                         paste0(strsplit(file_name, split="_")[[1]][2:3], collapse="_"),
                    ifelse(sample_type == "HC229" | sample_type == "HC233",
                         paste0(strsplit(file_name, split="_")[[1]][4:5], collapse="_"),
                         paste0(strsplit(file_name, split="_")[[1]][1:2], collapse="_")))) %>%
    ungroup()

saveRDS(
    lapply(mg_list, function(dat) {
        dat$cell_type_df_full %>%
            left_join(samp_df,
                    by=c("fcs_name"="file_name"))
                    }) %>%
        bind_rows(),
    file=file.path(data_folder, "data/cell_type_full_df_b4.RDS")
)

all_cells_df <- readRDS(file.path(data_folder, "data/cell_type_full_df_b4.RDS"))

# Convert to SingleCellExperiment
sce <- prepData(x=fcs_raw, 
                panel=fcs_panel,
                md=samp_df,
                cofactor = 5,
                panel_cols = list(channel="name",
                                  antigen="desc"),
                md_cols = list(file="file_name",
                               id="pat_id",
                               factors=c("sample_type", "file_name"))
)

# Edit colData
colData <- colData(sce)
colData$file_name <- as.character(colData$file_name)

### Join wsp cell types ----------------------------------
colData$mg_cell_path <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
  left_join(all_cells_df %>%
              select(cell_index, fcs_name, mg_cell_path),
            by=c("cell_index"="cell_index",
                 "file_name"="fcs_name")) %>%
  dplyr::pull(mg_cell_path)

# Add batch information, taken from the workspace name
colData$batch <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
  left_join(all_cells_df %>%
              select(cell_index, fcs_name, wsp_name),
            by=c("cell_index"="cell_index",
                 "file_name"="fcs_name")) %>%
  pull(wsp_name) %>%
  readr::parse_number()

# Add indexes for cells
colData$cell_index <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
    pull(cell_index)

colData(sce) <- colData

# Switch the values for "156Gd_PE" "175Lu_CD279_PD1" for batch 2
sce_exprs_new = assay(sce, "exprs")
sce_exprs_new[c(which(rownames(sce)=="156Gd_CD279_PD1"), which(rownames(sce)=="175Lu_PE")), which(colData$batch==2)] <-
    sce_exprs_new[c(which(rownames(sce)=="175Lu_PE"), which(rownames(sce)=="156Gd_CD279_PD1")), which(colData$batch==2)]

assay(sce, "exprs") <- sce_exprs_new

saveRDS(sce,
        file=file.path(data_folder, "data/sce_b4.rds"))

1.1.2.2 Creating SCE for 2024 Batch

#' ==================================================
#' Read in data
#' --------------------------------------------------
fcsFiles <- list.files(file.path(data_folder, "data"),
                       pattern=".fcs",
                       recursive = TRUE,
                       full.names = TRUE) %>%
    .[!grepl("Don't USE",.)]

fcsFiles <- fcsFiles[grepl("2024", fcsFiles)]

fcs_raw <- read.flowSet(fcsFiles, 
                        transformation = FALSE, 
                        truncate_max_range = FALSE)#,
                        # column.pattern = "SampleID", # Remove sample ID as only some have it
                        # invert.pattern = TRUE)

### Get marker information ----------------------------------
# pregating_channels <- c("Bead", "DNA1", "DNA2", "Dead", "Cell_length")
lineage_channels <- c("CD11c","CD56",
                      "CD8a",#"CD8A",
                      "CD57","CD27","CD19","CD45RA",
                      "CD4","KLRG1","CD45RO","CD31",
                      "CD194_CCR4",#"CD194 (CCR4)",
                      "CD197_CCR7",#"CD197 (CCR7)",
                      "CD14","HLADR_APC","CD16","IgD","Ki67",
                      "CD25","CD3","CD38",
                      "IntegrinB7",#"integrin beta7", 
                      "CD127")

# Base the panel off the batch 1 .fcs
fcs_panel <- pData(parameters(fcs_raw1[[which(fcsFiles == "/dskh/nobackup/alexq/nadav/data/Debarcoded fcs1 2024/HC233/HC233_CD45_108_CHW_001_d90_180.fcs")]])) %>%
  select(name, desc) %>%
    mutate(marker = gsub("^[^_]+_", "", desc)) %>%
    mutate(marker_class = ifelse(marker %in% lineage_channels,
                                "type", ifelse(is.na(marker),
                                                  "none",
                                                  "state")))

samp_df <- data.frame(file_name_full=list.files(file.path(data_folder, "data"),
                       pattern=".fcs",
                       recursive = TRUE,
                       full.names = FALSE)) %>%
    filter(!grepl("Don't USE",file_name_full)) %>%
    # mutate(batch = strsplit(file_name_full, split = "/")[[1]][1]) %>%
    tidyr::separate_wider_delim(file_name_full, delim = "/", names = c("batch", "sample_type", "file_name")) %>%
    rowwise() %>% 
    mutate(pat_id = ifelse(sample_type == "HC229" & batch == "Debarcoded fcs1",
                         paste0(strsplit(file_name, split="_")[[1]][2:3], collapse="_"),
                    ifelse(sample_type == "HC229" | sample_type == "HC233",
                         paste0(strsplit(file_name, split="_")[[1]][4:5], collapse="_"),
                         paste0(strsplit(file_name, split="_")[[1]][1:2], collapse="_")))) %>%
    ungroup()

samp_df <- samp_df[grepl("2024", samp_df$batch),]

saveRDS(
    lapply(mg_list[5:9], function(dat) {
        dat$cell_type_df_full %>%
            left_join(samp_df,
                    by=c("fcs_name"="file_name"))
                    }) %>%
        bind_rows(),
    file=file.path(data_folder, "data/cell_type_full_df_b4_2024.RDS")
)

all_cells_df <- readRDS(file.path(data_folder, "data/cell_type_full_df_b4_2024.RDS"))

all_cells_df <- all_cells_df[grepl("2024", all_cells_df$batch),]

# Convert to SingleCellExperiment
sce <- prepData(x=fcs_raw, 
                panel=fcs_panel,
                md=samp_df,
                cofactor = 5,
                panel_cols = list(channel="name",
                                  antigen="desc"),
                md_cols = list(file="file_name",
                               id="pat_id",
                               factors=c("sample_type", "file_name"))
)

sce <- sce[,sce$file_name != "HC233_CD45_108_LCH_005_d30_d90_d180_concat_1.fcs"]
sce <- sce[,sce$file_name != "HC233_CD45_108_PCH_005_d30_d90_d180_concat_1.fcs"]

all_cells_df <- all_cells_df[all_cells_df$fcs_name != "HC233_CD45_108_PCH_005_d30_d90_d180_concat_1.fcs",]

# Edit colData
colData <- colData(sce)
colData$file_name <- as.character(colData$file_name)

### Join wsp cell types ----------------------------------
colData$mg_cell_path <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
  left_join(all_cells_df %>%
              select(cell_index, fcs_name, mg_cell_path),
            by=c("cell_index"="cell_index",
                 "file_name"="fcs_name")) %>%
  dplyr::pull(mg_cell_path)

# Add batch information, taken from the workspace name
colData$batch <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
  left_join(all_cells_df %>%
              select(cell_index, fcs_name, wsp_name),
            by=c("cell_index"="cell_index",
                 "file_name"="fcs_name")) %>%
  pull(wsp_name) %>%
  readr::parse_number()

# Add indexes for cells
colData$cell_index <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
    pull(cell_index)

colData(sce) <- colData

# Things to switch out (Current to New)
# 209Bi_Tbet to 209Bi_Cd11b
# 146Nd_CD8a to 115In_CD8a
# 147Sm_CD45RO to 152Gd_CD45RO
# 155Gd_EOMES to 146Nd_EOMES
# 153Eu_CD185_CXCR5 to 147Sm_CD185_CXCR5
# Remove TCR Va7.2 from the 2022 SCE
# 153Gd_CD66b to 153Eu_CD66b <- this one is probably not important since it's removed anyway

# Switch the values for "164Dy_CD86_biotin" "175Lu_CD279_PD1" for batch 2
sce_exprs_new = assay(sce, "exprs")
sce_exprs_new[c(which(rownames(sce)=="156Gd_CD279_PD1"), which(rownames(sce)=="175Lu_PE")), which(colData$batch==2)] <-
    sce_exprs_new[c(which(rownames(sce)=="175Lu_PE"), which(rownames(sce)=="156Gd_CD279_PD1")), which(colData$batch==2)]

assay(sce, "exprs") <- sce_exprs_new

saveRDS(sce,
        file=file.path(data_folder, "data/sce_b4_2024.rds"))
#' ==================================================
#' Read in data
#' --------------------------------------------------
fcsFiles <- list.files(file.path(data_folder, "data"),
                       pattern=".fcs",
                       recursive = TRUE,
                       full.names = TRUE) %>%
    .[!grepl("Don't USE",.)]

fcsFiles <- fcsFiles[!grepl("2024", fcsFiles)]

fcs_raw <- read.flowSet(fcsFiles, 
                        transformation = FALSE, 
                        truncate_max_range = FALSE)#,
                        # column.pattern = "SampleID", # Remove sample ID as only some have it
                        # invert.pattern = TRUE)

fcsFiles <- list.files(file.path(data_folder, "data"),
                       pattern=".fcs",
                       recursive = TRUE,
                       full.names = TRUE) %>%
    .[!grepl("Don't USE",.)]

fcsFiles <- fcsFiles[grepl("2024", fcsFiles)]

fcs_raw1 <- read.flowSet(fcsFiles, 
                        transformation = FALSE, 
                        truncate_max_range = FALSE)#,
                        # column.pattern = "SampleID", # Remove sample ID as only some have it
                        # invert.pattern = TRUE)


### Get marker information ----------------------------------
# pregating_channels <- c("Bead", "DNA1", "DNA2", "Dead", "Cell_length")
lineage_channels <- c("CD11c","CD56",
                      "CD8a",#"CD8A",
                      "CD57","CD27","CD19","CD45RA",
                      "CD4","KLRG1","CD45RO","CD31",
                      "CD194_CCR4",#"CD194 (CCR4)",
                      "CD197_CCR7",#"CD197 (CCR7)",
                      "CD14","APC","CD16","IgD","Ki67",
                      "CD25","CD3","CD38",
                      "IntB7",#"integrin beta7",
                      "CD127")

# Base the panel off the batch 1 .fcs
fcs_panel <- pData(parameters(fcs_raw1[[which(fcsFiles == "/dskh/nobackup/alexq/nadav/data/Debarcoded fcs1 2024/HC233/HC233_CD45_108_CHW_001_d90_180.fcs")]])) %>%
  select(name, desc) %>%
    mutate(marker = gsub("^[^_]+_", "", desc)) %>%
    mutate(marker_class = ifelse(marker %in% lineage_channels,
                                "type", ifelse(is.na(marker),
                                                  "none",
                                                  "state")))

fcs_panel_2022 <- pData(parameters(fcs_raw[[which(fcsFiles == "/dskh/nobackup/alexq/nadav/data/Debarcoded fcs1/HC229/HC299_CHW_002_d180_CD45_104.fcs")]])) %>%
  select(name, desc) %>%
    mutate(marker = gsub("^[^_]+_", "", desc)) %>%
    mutate(marker_class = ifelse(marker %in% lineage_channels,
                                "type", ifelse(is.na(marker),
                                                  "none",
                                                  "state")))


fcspanel_2022 <- read.csv("fcs_panel_2022.csv")
fcspanel_2024 <- read.csv("fcs_panel_2024.csv")

comp_fcs_panel <- full_join(fcspanel_2022, fcspanel_2024, by = c("marker", "name", "desc", "marker_class"))


full_join(fcs_panel, fcs_panel_2022, by = c("marker", "name", "desc", "marker_class")) |> View()


### Get sample information ----------------------------------
### Read in flow data



samp_df <- data.frame(file_name_full=list.files(file.path(data_folder, "data"),
                       pattern=".fcs",
                       recursive = TRUE,
                       full.names = FALSE)) %>%
    filter(!grepl("Don't USE",file_name_full)) %>%
    # mutate(batch = strsplit(file_name_full, split = "/")[[1]][1]) %>%
    tidyr::separate_wider_delim(file_name_full, delim = "/", names = c("batch", "sample_type", "file_name")) %>%
    rowwise() %>% 
    mutate(pat_id = ifelse(sample_type == "HC229" & batch == "Debarcoded fcs1",
                         paste0(strsplit(file_name, split="_")[[1]][2:3], collapse="_"),
                    ifelse(sample_type == "HC229" | sample_type == "HC233",
                         paste0(strsplit(file_name, split="_")[[1]][4:5], collapse="_"),
                         paste0(strsplit(file_name, split="_")[[1]][1:2], collapse="_")))) %>%
    ungroup()

samp_df$pat_id[samp_df$pat_id == "TeT_positive"] <- "JE_010"


### Checks to make mg_list and samp_df match up

# check <- lapply(mg_list, function(dat) {
#   unique(dat$cell_type_df_full$fcs_name)
# })
# 
# duplicated(check) |> which()
# 
# check <- unlist(check)
# check <- unique(check)
# 
# setdiff(samp_df$file_name, check)
# setdiff(check, samp_df$file_name)

saveRDS(
    lapply(mg_list, function(dat) {
        dat$cell_type_df_full %>%
            left_join(samp_df,
                    by=c("fcs_name"="file_name"))
                    }) %>%
        bind_rows(),
    file=file.path(data_folder, "data/cell_type_full_df_b4.RDS")
)

all_cells_df <- readRDS(file.path(data_folder, "data/cell_type_full_df_b4.RDS"))

# Convert to SingleCellExperiment
sce <- prepData(x=fcs_raw, 
                panel=fcs_panel,
                md=samp_df,
                cofactor = 5,
                panel_cols = list(channel="name",
                                  antigen="desc"),
                md_cols = list(file="file_name",
                               id="pat_id",
                               factors=c("sample_type", "file_name"))
)

# Edit colData
colData <- colData(sce)
colData$file_name <- as.character(colData$file_name)

### Join wsp cell types ----------------------------------
colData$mg_cell_path <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
  left_join(all_cells_df %>%
              select(cell_index, fcs_name, mg_cell_path),
            by=c("cell_index"="cell_index",
                 "file_name"="fcs_name")) %>%
  dplyr::pull(mg_cell_path)

# Add batch information, taken from the workspace name
colData$batch <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
  left_join(all_cells_df %>%
              select(cell_index, fcs_name, wsp_name),
            by=c("cell_index"="cell_index",
                 "file_name"="fcs_name")) %>%
  pull(wsp_name) %>%
  readr::parse_number()

# Add indexes for cells
colData$cell_index <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
    pull(cell_index)

colData(sce) <- colData

# Switch the values for "156Gd_PE" "175Lu_CD279_PD1" for batch 2
sce_exprs_new = assay(sce, "exprs")
sce_exprs_new[c(which(rownames(sce)=="156Gd_CD279_PD1"), which(rownames(sce)=="175Lu_PE")), which(colData$batch==2)] <-
    sce_exprs_new[c(which(rownames(sce)=="175Lu_PE"), which(rownames(sce)=="156Gd_CD279_PD1")), which(colData$batch==2)]

assay(sce, "exprs") <- sce_exprs_new

saveRDS(sce,
        file=file.path(data_folder, "data/sce_b4.rds"))

2 Merging SCEs

# Things to switch out (Current to New)
# 209Bi_Tbet to 209Bi_Cd11b
# 146Nd_CD8a to 115In_CD8a
# 147Sm_CD45RO to 152Gd_CD45RO
# 155Gd_EOMES to 146Nd_EOMES
# 153Eu_CD185_CXCR5 to 147Sm_CD185_CXCR5
# Remove TCR Va7.2 from the 2022 SCE
# 153Gd_CD66b to 153Eu_CD66b <- this one is probably not important since it's removed anyway

rownames(sce_b4_2024)[rownames(sce_b4_2024) == "209Bi_Tbet"] <- "209Bi_CD11b"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "146Nd_CD8a"] <- "115In_CD8a"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "147Sm_CD45RO"] <- "152Gd_CD45RO"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "155Gd_EOMES"] <- "146Nd_EOMES"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "153Eu_CD185_CXCR5"] <- "147Sm_CD185_CXCR5"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "152Gd_CD66b"] <- "153Eu_CD66b"

rownames(sce_b4_2024)[rownames(sce_b4_2024) == "161Dy_HLADR_APC"] <- "161Dy_APC"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "164Er_CD86_Biotin"] <- "164Dy_CD86_biotin"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "115In_CD31"] <- "155Gd_CD31"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "171Yb_GranzymeB"] <- "171Yb_Granzyme_B"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "173Yb_IntegrinB7"] <- "173Yb_IntB7"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "149Sm_CD366_Tim3"] <- "149Sm_CD366_TIM3"
rownames(sce_b4_2024)[rownames(sce_b4_2024) == "175Lu__PE"] <- "175Lu_PE"

# idx <- order(rownames(sce_b4_2024))
# sce_b4_2024 <- sce_b4_2024[idx,]
# 
# idx <- order(rownames(sce_b4))
# sce_b4 <- sce_b4[idx,]

markers2022 <- rownames(sce_b4)[grepl("_", rownames(sce_b4))]
markers2024 <- rownames(sce_b4_2024)[grepl("_", rownames(sce_b4_2024))]
# markers2022 <- c(markers2022, paste0("tmp"))

data.frame(markers2022, markers2024) |> View()

markers2022 <- markers2022[!markers2022 %in% c("104Pd_CD45_104", "154Gd_CD45_154", "144Nd_FITC_TCR_VB7.1", "162Er_TCR_VB21.3", "167Er_TCR_Va7.2", "174Yb_TCR_VB13.1", "175Lu_PE")]

markers2024 <- markers2024[!markers2024 %in% c("167Er__FITC", "175Lu_PE", "144Nd_TCRgd")]

sce_b4 <- sce_b4[rownames(sce_b4) %in% markers2022]
sce_b4_2024 <- sce_b4_2024[rownames(sce_b4_2024) %in% markers2024]

sce_b4_2024$batch <- sce_b4_2024$batch + 4

rData_2022 <- rowData(sce_b4) |> as.data.frame()
rData_2024 <- rowData(sce_b4_2024) %>% 
  as.data.frame() %>%
  mutate(marker_name = rownames(.)) %>%
  left_join(rData_2022, by = "marker_name") %>%
  select(c(channel_name.y, marker_name, marker_class.y))
colnames(rData_2024) <- colnames(rData_2022)

ref <- rownames(sce_b4)
sce_b4_2024 <- sce_b4_2024[ref,]

sce <- cbind(sce_b4, sce_b4_2024)
saveRDS(sce,
        file=file.path(data_folder, "data/sce_complete.rds"))
#' ==================================================
#' Read in data - MHA
#' --------------------------------------------------
fcsFiles <- list.files(file.path("/dskh/nobackup/alexq/nadav/mha_data/MHA_IVoCC_002"),
                       pattern=".fcs",
                       recursive = TRUE,
                       full.names = TRUE) %>%
  .[!grepl("Don't USE",.)]

fcs_raw2 <- read.flowSet(fcsFiles, 
                        transformation = FALSE, 
                        truncate_max_range = FALSE)


### Get marker information ----------------------------------
lineage_channels <- c("CD11c","CD56",
                      "CD8a",#"CD8A",
                      "CD57","CD27","CD19","CD45RA",
                      "CD4","KLRG1","CD45RO","CD31",
                      "CD194_CCR4",#"CD194 (CCR4)",
                      "CD197_CCR7",#"CD197 (CCR7)",
                      "CD14","APC","CD16","IgD","Ki67",
                      "CD25","CD3","CD38",
                      "IntB7",#"integrin beta7",
                      "CD127")

# Base the panel off the batch 1 .fcs
fcs_panel3 <- pData(parameters(fcs_raw2[[which(fcsFiles == "/dskh/nobackup/alexq/nadav/mha_data/MHA_IVoCC_002/FCS/MHA_002_BMT_CD45_154.fcs")]])) %>%
  select(name, desc) %>%
    mutate(marker = gsub("^[^_]+_", "", desc)) %>%
    mutate(marker_class = ifelse(marker %in% lineage_channels,
                                "type", ifelse(is.na(marker),
                                                  "none",
                                                  "state")))

### Get sample information ----------------------------------
### Read in flow data
samp_df <- data.frame(file_name_full=list.files(file.path("/dskh/nobackup/alexq/nadav/mha_data/MHA_IVoCC_002"),
                       pattern=".fcs",
                       recursive = TRUE,
                       full.names = FALSE)) %>%
    filter(!grepl("Don't USE",file_name_full)) %>%
    mutate(file_name_full = gsub("FCS\\/", 
                                 "", file_name_full)) %>%
    mutate(sample_type = "Samples",
           file_name = gsub( ".*\\/", "", file_name_full )) %>%
    rowwise() %>% 
    mutate(pat_id = ifelse(sample_type == "HC229",
                         paste0(strsplit(file_name, split="_")[[1]][2:3], collapse="_"),
                         paste0(strsplit(file_name, split="_")[[1]][1:2], collapse="_"))) %>%
    ungroup()

saveRDS(
    mg_list_mha$cell_type_df_full %>%
            left_join(samp_df,
                    by=c("fcs_name"="file_name")),
    file=file.path(data_folder, "data/cell_type_full_df_mha.RDS")
)

all_cells_df_mha <- readRDS(file.path(data_folder, "data/cell_type_full_df_mha.RDS"))

# Convert to SingleCellExperiment
sce_mha <- prepData(x=fcs_raw, 
                panel=fcs_panel,
                md=samp_df,
                cofactor = 5,
                panel_cols = list(channel="name",
                                  antigen="desc"),
                md_cols = list(file="file_name",
                               id="pat_id",
                               factors=c("sample_type", "file_name"))
)

# Edit colData
colData <- colData(sce_mha)
colData$file_name <- as.character(colData$file_name)

### Join wsp cell types ----------------------------------
colData$mg_cell_path <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
  left_join(all_cells_df_mha %>%
              select(cell_index, fcs_name, mg_cell_path),
            by=c("cell_index"="cell_index",
                 "file_name"="fcs_name")) %>%
  dplyr::pull(mg_cell_path)

# Add batch information, taken from the workspace name
colData$batch <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
  left_join(all_cells_df_mha %>%
              select(cell_index, fcs_name, wsp_name),
            by=c("cell_index"="cell_index",
                 "file_name"="fcs_name")) %>%
  pull(wsp_name) %>%
  readr::parse_number()

# Add indexes for cells
colData$cell_index <- colData %>%
  as.data.frame() %>%
  group_by(file_name) %>%
  mutate(cell_index=1:n()) %>%
    pull(cell_index)

colData(sce_mha) <- colData
# 
saveRDS(sce_mha,
        file=file.path(data_folder, "data/sce_mha.rds"))

3 Analysis

# Load sce
sce <- readRDS(file.path(data_folder, "data/sce_complete.rds"))

sce_mha <- readRDS(file.path(data_folder, "data/sce_mha.rds"))
# Load cell type information for each cell
all_cells_df <- readRDS(file.path(data_folder, "data/cell_type_full_df_b4.RDS"))

all_cells_df_mha <- readRDS(file.path(data_folder, "data/cell_type_full_df_mha.RDS"))

all_cells_df_comb <- bind_rows(
    all_cells_df,
    all_cells_df_mha
)

unique_fcs <- all_cells_df_comb$fcs_name |> unique()
unique_fcs2 <- all_cells_df_comb %>% distinct(fcs_name) %>% pull()
# Checking if the file names in all_cells_df are in the sce
dif <- setdiff(unique(all_cells_df$fcs_name), unique(sce$file_name))

all_cells_df <- all_cells_df |>
  filter(!fcs_name %in% dif)

dif <- setdiff(unique(sce$file_name), unique(all_cells_df$fcs_name))

if(!isEmpty(dif)) {
  sce <- sce[,sce$file_name != dif]
}

3.1 MEM Analysis

# all_cells_df$dpt <- rep(1000, nrow(all_cells_df))

#' Load data from Nadav's Excel
#' --------------------
nadav_panel_marker <- read_xlsx(file.path(data_folder, "data/Nadav_Panel_2024_v2.xlsx"),
                         sheet="Relevant Markers") %>%
    filter(!is.na(Comments))

nadav_panel_marker_mha <- read_xlsx(file.path(data_folder, "data/Nadav_Panel_2024_v2.xlsx"),
                         sheet="Relevant Markers MHA_002") %>%
    filter(!is.na(Comments))

nadav_tcrvb <- read_xlsx(file.path(data_folder, "data/Nadav_Panel_2024_v2.xlsx"),
                         sheet="TCRVB CyTOF batches",
                         skip=1) %>%
    select(-contains("...")) %>%
    rename("fcs_name"="fcs name")
## New names:
## • `` -> `...33`
# colnames(nadav_tcrvb)[28] <- "TRBV12-3, TRBV12-4"

nadav_tcrvb_long <- nadav_tcrvb %>%
    pivot_longer(cols=!c("#","fcs_name", "Samples", "DPT", "Batch", "channel", "CMV Tetramer A02:01",
                         "Patient ID"), 
                names_to="name", values_to="val")

nadav_tcrvb_filt <- nadav_tcrvb_long %>% 
    filter(grepl("yes", val)) %>%
    rowwise() %>% 
    mutate(sample_id = strsplit(Samples, split=" ")[[1]][1],
           dpt = DPT,
           batch=Batch) %>%
           # dpt = gsub("d", "", strsplit(Samples, split=" ")[[1]][2])) %>%
    mutate(sample_id = gsub("P2-|P1-", "", sample_id) %>% 
        gsub("-", "_", .)) %>%
    ungroup() %>% 
    distinct(fcs_name, sample_id, `Patient ID`,
             name, dpt, batch) %>%
  ###
  # Line where samples beginning with CHW_ are explicitly removed.
  ###
    # mutate(sample_id = gsub("CHW_","", sample_id) %>%
    #            gsub("IVoCC_", "", .)) %>%
    rowwise() %>%
    mutate(sample_id = strsplit(sample_id, split="_")[[1]] %>%
               .[1:2] %>%
               paste0(., collapse="_")) %>%
    ungroup()



# Filtered vbetas
nadav_tcrvb_filt %>%
    create_dt()
# nonExistTRBVs <- unique(nadav_tcrvb_filt$name)[!sapply(unique(nadav_tcrvb_filt$name), function(pat) any(grepl(pat, colnames(all_cells_df))))]
# 
# colnames(all_cells_df)[grepl("TRBV12", colnames(all_cells_df))]
# 
# nadav_tcrvb_filt$name <- ifelse(nadav_tcrvb_filt$name %in% nonExistTRBVs, substr(nadav_tcrvb_filt$name, 1, nchar(nadav_tcrvb_filt$name) - 2), nadav_tcrvb_filt$name)

nadav_panel_marker <- nadav_panel_marker |>
  filter(!`FCS name` %in% c("167Er_TCR_Va7.2", "175Lu_PE", "not used in all samples, specific use"))

nadav_tcrvb_filt$batch[nadav_tcrvb_filt$batch == "1_2024"] <- "5"
nadav_tcrvb_filt$batch[nadav_tcrvb_filt$batch == "2_2024"] <- "6"
nadav_tcrvb_filt$batch[nadav_tcrvb_filt$batch == "3_2024"] <- "7"
nadav_tcrvb_filt$batch[nadav_tcrvb_filt$batch == "4_2024"] <- "8"
nadav_tcrvb_filt$batch[nadav_tcrvb_filt$batch == "5_2024"] <- "9"

# Select patients with vbetas and fcs data
# vbeta_fcs <- intersect(
#     nadav_tcrvb_filt %>% distinct(fcs_name) %>% pull(),
#     unique_fcs2)
# Implement and run MEM from https://www.nature.com/articles/nmeth.4149

# all_cells_df <- all_cells_df |>
#   left_join(nadav_tcrvb_filt)

IQR.thresh <- function(MAGpop, MAGref, IQRpop, IQRref, num_markers){
    #   Use universal IQR threshodling
    IQR_thresh_pop = matrix()
    IQR_thresh_ref = matrix()
    for(i in 1:(num_markers-1)){
        MAGpop_belowThresh <- MAGpop[,i]<=quantile(MAGpop[,i], na.rm=T)[2]
        IQR_thresh_pop[i] <- min(IQRpop[,i][MAGpop_belowThresh])
        MAGref_belowThresh <- MAGref[,i]<=quantile(MAGref[,i], na.rm=T)[2]
        IQR_thresh_ref[i] <- min(IQRref[,i][MAGref_belowThresh])
    }
    IQR_thresh = mean(c(IQR_thresh_pop,IQR_thresh_ref))

    return(IQR_thresh)
}

plotvBetaMarkerWithinPat <- function(pat_id_val, all_pats=FALSE) {
    if (!all_pats) {
        vbeta_pops <- nadav_tcrvb_filt %>%
            # filter(sample_id %in% pat_id_val) %>%
            filter(fcs_name %in% pat_id_val) %>%
            pull(name)
        
    } else {
        vbeta_pops <- unique(nadav_tcrvb_filt$name)
    }

    vbeta_pop_paths <- colnames(all_cells_df)[
            grepl(paste0(paste0(vbeta_pops, "$"), collapse="|"), 
                colnames(all_cells_df)) &
            grepl("CD3\\+\\/non-NK", 
                colnames(all_cells_df))]

    # Subset 
    cd3_cell_ind <- all_cells_df %>%
        # filter patients with a v beta available
        {if(all_pats) . else filter(., fcs_name %in% pat_id_val)} %>%
        # filter CD3+ only
        filter(`/CD3+/non-NK T cells` | `/CD3+/non-NKT T cells`) %>%
        # filter samples only %>%
        filter(sample_type %in% c("Samples", "Patients")) %>% 
        # get dpt from sample file 
        select(-dpt) %>%
        left_join(
            nadav_tcrvb_filt %>%
                {if(all_pats) . else filter(., fcs_name %in% pat_id_val)} %>%
                distinct(fcs_name, dpt, batch),
            by="fcs_name"
        ) %>%
        # select_if(function(col) sum(is.na(col)) == 0 ) %>%
        select(any_of(vbeta_pop_paths), mg_cell_path, cell_index, fcs_name,sample_type,dpt, pat_id) %>%
        left_join(
            colData(sce) %>% 
                as.data.frame() %>%
                mutate(sce_ind = 1:n()) %>%
                select(#sample_id, #dpt, 
                       file_name, cell_index, sce_ind) ,
            by=c(#"pat_id"="sample_id",
                 # "dpt"="dpt",
                 "fcs_name"="file_name",
                 "cell_index"="cell_index")
        ) %>%
        select_if(function(col) any(!is.na(col)))
    
    trbv_cols <- cd3_cell_ind %>%
        select(any_of(vbeta_pop_paths)) %>%
        colnames()
    
    pat_cell_df <- cd3_cell_ind %>%
        bind_cols(
            t(assay(sce, "exprs")) %>%
                as.data.frame() %>%
                select(all_of(nadav_panel_marker$`FCS name`)) %>%
                `colnames<-`(nadav_panel_marker$name) %>%
                dplyr::slice(cd3_cell_ind$sce_ind)
            ) %>%
        pivot_longer(cols=all_of(trbv_cols),
                     names_to="pop",
                     values_to="value") %>%
        filter(!is.na(value)) %>%
        mutate(value = ifelse(value, "vbeta", "rest")) %>%
        select(-c(mg_cell_path, sample_type)) 
    
    if (all_pats) {
        pop_comp_df <- pat_cell_df %>% 
            distinct(pop)
        
        pat_n_cells_df <- pat_cell_df %>% 
            group_by(pop) %>% 
            summarise(n_cells=n(), 
                      n_vbeta = sum(value == "vbeta"),
                      .groups='drop')
    } else {
        pop_comp_df <- pat_cell_df %>% 
            distinct(pop, fcs_name, dpt, pat_id)  %>%
            mutate(vbeta_name = sub(".*/", "", pop)) %>%
            # Filter samples with NA dpt, since they don't have a vbeta listed on Nadavs sheet
            filter(!is.na(dpt)) %>%
            # Need to filter only vbeta patient combinations
            filter(
                grepl(
                    #legal vbeta name combinations
                    nadav_tcrvb_filt %>% 
                        mutate(match_str = paste(name,sample_id,dpt,sep="_")) %>% 
                        pull(match_str) %>% 
                        paste(., collapse="|"),
                    paste(pop,pat_id,dpt,sep="_")
                )
            )
        
        pat_n_cells_df <- pat_cell_df %>% 
            group_by(fcs_name, dpt, pat_id, pop) %>% 
            summarise(n_cells=n(), 
                      n_vbeta = sum(value == "vbeta"),
                      .groups='drop')
    }
    
    num_markers = length(nadav_panel_marker$name)
    num_pops = nrow(pop_comp_df)
        
    MAGpop = matrix(nrow=num_pops,ncol=num_markers)
    MAGref = matrix(nrow=num_pops,ncol=num_markers)
    IQRpop = matrix(nrow=num_pops,ncol=num_markers)
    IQRref = matrix(nrow=num_pops,ncol=num_markers)

    for (i in seq_len(nrow(pop_comp_df))) {
        if (all_pats) {
            filt_dat <- pat_cell_df %>% 
                filter(pop == pop_comp_df$pop[i])
        } else {
            filt_dat <- pat_cell_df %>% 
                filter(fcs_name == pop_comp_df$fcs_name[i] & 
                           dpt  == pop_comp_df$dpt[i] &
                           pat_id == pop_comp_df$pat_id[i] & 
                           pop == pop_comp_df$pop[i])
        }
        MAGpop[i,] = filt_dat %>% 
            filter(value == "vbeta") %>% 
            select(all_of(nadav_panel_marker$name)) %>% 
            summarise_all(function(col) median(col, na.rm=TRUE)) %>%
            t() %>% t()
        
        IQRpop[i,] = filt_dat %>% 
            filter(value == "vbeta") %>% 
            select(all_of(nadav_panel_marker$name)) %>% 
            summarise_all(function(col) IQR(col, na.rm=TRUE)) %>%
            t() %>% t()
        
        MAGref[i,] = filt_dat %>% 
            filter(value == "rest") %>% 
            select(all_of(nadav_panel_marker$name)) %>% 
            summarise_all(function(col) median(col, na.rm=TRUE)) %>%
            t() %>% t()
        IQRref[i,] = filt_dat %>% 
            filter(value == "rest") %>% 
            select(all_of(nadav_panel_marker$name)) %>% 
            summarise_all(function(col) IQR(col, na.rm=TRUE)) %>%
            t() %>% t()
    }
    
    # Remove the line where there is NA, probably due to not filtering the 
    # vbetas for the particular days post
    remove_ind = which(is.na(rowSums(IQRpop)))
    if (!identical(remove_ind, integer(0))) {
        pop_comp_df <- pop_comp_df %>% slice(-remove_ind)
        MAGpop=MAGpop[-remove_ind,]
        IQRpop=IQRpop[-remove_ind,]
        MAGref=MAGref[-remove_ind,]
        IQRref=IQRref[-remove_ind,]
    }
    
    
    IQR_thresh <- IQR.thresh(MAGpop,MAGref,IQRpop,IQRref,num_markers) # nolint

    for (i in seq_len(num_markers)){
        IQRpop[,i] = pmax(IQRpop[,i],IQR_thresh)
        IQRref[,i] = pmax(IQRref[,i],IQR_thresh)
    }

    # Calculate MEM scores
    MAG_diff = MAGpop-MAGref
    MEM_matrix = abs(MAGpop-MAGref)+(IQRref/IQRpop)-1
    MEM_matrix[!(MAG_diff>=0)] <- (-MEM_matrix[!(MAG_diff>=0)])

    # Put MEM values on -10 to +10 scale
    scale_max = max(abs(MEM_matrix[,c(1:ncol(MEM_matrix)-1)]))
    if (nrow(MEM_matrix) == 1) {
        MEM_matrix = c((MEM_matrix[,c(1:ncol(MEM_matrix)-1)]/scale_max)*10,MEM_matrix[,ncol(MEM_matrix)]) %>%
            matrix(nrow=1)
    } else {
        MEM_matrix = cbind((MEM_matrix[,c(1:ncol(MEM_matrix)-1)]/scale_max)*10,MEM_matrix[,ncol(MEM_matrix)])
    }

    colnames(MEM_matrix) <- nadav_panel_marker$name
    colnames(MAGpop) <- nadav_panel_marker$name
    
    pop_mem_df <- pop_comp_df %>%
        bind_cols(MEM_matrix)%>%
        {if (all_pats) left_join(
            ., 
            pat_n_cells_df,
            by=c("pop"="pop")) else left_join(
                .,
                pat_n_cells_df,
                by=c("fcs_name"="fcs_name",
                     "dpt"="dpt",
                     "pat_id"="pat_id",
                     "pop"="pop")
            )}
    
    pop_median_df <- pop_comp_df %>%
        bind_cols(MAGpop) %>%
        {if (all_pats) left_join(
            ., 
            pat_n_cells_df,
            by=c("pop"="pop")) else left_join(
                .,
                pat_n_cells_df,
                by=c("fcs_name"="fcs_name",
                     "dpt"="dpt",
                     "pat_id"="pat_id",
                     "pop"="pop")
            )}
    gc()
    return(list(pop_mem_df = pop_mem_df,
                pop_median_df = pop_median_df))
}

# For Loop instead of lapply
# Check to see if sample 7 is dramatically bigger than everything else

start_time = Sys.time()
vbeta_mem_pat_list = lapply(vbeta_fcs,
                            function(x) {
                                print(x) 
                                return(tryCatch(plotvBetaMarkerWithinPat(x), 
                                                error=function(e) {
                                                    message(e)
                                                    return(NULL)
                                                }))
                                })


vbeta_mem_all_list = lapply("",
                            function(x) plotvBetaMarkerWithinPat(x, 
                                                                 all_pats=TRUE))

mem_res_df <- lapply(vbeta_mem_pat_list, function(x) x$pop_mem_df) %>%
    bind_rows() 

mem_res_all_df <- lapply(vbeta_mem_all_list, function(x) x$pop_mem_df) %>%
    bind_rows() 

vbeta_med_df <- lapply(vbeta_mem_pat_list, function(x) x$pop_median_df) %>%
    bind_rows()

Sys.time() - start_time

save(
    mem_res_df,
    mem_res_all_df,
    vbeta_med_df,
    file=file.path(data_folder, "data/mem_res_dat_b4.RData")
)

Heatmap of MEM results

load(file.path(data_folder, "data/mem_res_dat_b4.RData"))

suppressPackageStartupMessages({
    library(ComplexHeatmap)
    library(circlize)
})

# MEM results by patient
#' ------------------------
col_fun = colorRamp2(c(-10, 0, 10), c("#66a6cc", "#fff8de", "#cc2010"))

tcrvb_filt_vals <- (paste(gsub("^.*/", "", mem_res_df$pop), mem_res_df$fcs_name, sep="_")) %in% 
               (nadav_tcrvb_filt %>% mutate(str=paste(name,fcs_name,sep="_")) %>% pull(str))

markers_to_plot <- nadav_panel_marker %>%
    filter(`Position in heatmap` != "Omit") %>%
    pull(name)

marker_categories <- nadav_panel_marker %>%
    filter(`Position in heatmap` != "Omit") %>%
    mutate(Category = factor(Category, levels=unique(Category))) %>%
    pull(Category)

row_ha = rowAnnotation(
  df = mem_res_df %>%
    filter(tcrvb_filt_vals) %>%
      arrange(pat_id, pop, dpt) %>%
      select(-all_of(nadav_panel_marker$name)) %>%
      select(-fcs_name, -starts_with("n_")) %>% 
      as.data.frame()
)


hm_dat <- mem_res_df %>%
    # filter only the vbetas that were in Nadav's table 
    filter(tcrvb_filt_vals) %>%
    arrange(pat_id, pop, dpt) %>%
    select(all_of(markers_to_plot)) %>% 
    # select_if(function(col) length(unique(col)) > 1) %>%
    as.matrix()

ht_fig1 = Heatmap(
  hm_dat, 
  name = "ht_fig1",
  cluster_rows = FALSE,
  col = col_fun,
  column_split = marker_categories,
  cluster_columns = FALSE,
  row_split = mem_res_df %>%
      filter(tcrvb_filt_vals) %>%
      arrange(pat_id, pop, dpt) %>%
      pull(pat_id),
  row_title_gp  = gpar(fontsize = 5.5),
  column_title_gp  = gpar(fontsize = 8),
  left_annotation  = row_ha,
  right_annotation = rowAnnotation(
      `# Vbeta` = anno_barplot(
          mem_res_df %>%
              filter(tcrvb_filt_vals) %>%
              arrange(pat_id, pop, dpt) %>% 
              select(n_vbeta) %>%
              as.matrix(),
          add_numbers = TRUE,
          beside = TRUE, attach = TRUE,
          width = unit(2, "cm")
      )
  ),
  heatmap_legend_param = list(
    title = "MEM Score"
    )
  )

if (export_figs) {
    pdf.options(width=12, height=18)
    pdf(file=file.path(data_folder,paste0("data/fig/1_ht_fig1_MEM.pdf")))
    print(ht_fig1)
    dev.off()
}


# Median Experssion results by patient
#' ------------------------
col_fun = colorRamp2(c(-3, 0, 3), c("#66a6cc", "#fff8de", "#cc2010"))

row_ha = rowAnnotation(
  df = vbeta_med_df %>%
      filter(tcrvb_filt_vals) %>%
      arrange(pat_id, pop, dpt) %>%
      select(-all_of(nadav_panel_marker$name)) %>%
      select(-fcs_name, -starts_with("n_")) %>% 
      as.data.frame()
)

hm_dat <- vbeta_med_df %>%
      filter(tcrvb_filt_vals) %>%
    arrange(pat_id, pop, dpt) %>%
    select(all_of(markers_to_plot)) %>% 
    # select_if(function(col) length(unique(col)) > 1) %>%
    mutate_all(function(col) scale(col)[,1]) %>%
    as.matrix()

ht_fig2 = Heatmap(
  hm_dat, 
  name = "ht_fig1",
  column_title = "",
  cluster_rows = FALSE,
  col = col_fun,
  column_split = marker_categories,
  cluster_columns = FALSE,
  row_split = mem_res_df %>%
      filter(tcrvb_filt_vals) %>%
      arrange(pat_id, pop, dpt) %>%
      pull(pat_id),
  row_title_gp  = gpar(fontsize = 8),
  column_title_gp  = gpar(fontsize = 8),
  left_annotation  = row_ha,
  right_annotation = rowAnnotation(
      `# Vbeta` = anno_barplot(
          vbeta_med_df %>%
              filter(tcrvb_filt_vals) %>%
              arrange(pat_id, pop, dpt) %>% 
              select(n_vbeta) %>%
              as.matrix(),
          add_numbers = TRUE,
          beside = TRUE, attach = TRUE,
          width = unit(2, "cm")
      )
  ),
  heatmap_legend_param = list(
    title = "Z-Scores (of median expression)"
    )
  )

ht_fig2

if (export_figs) {
    pdf.options(width=12, height=18)
    pdf(file=file.path(data_folder,paste0("data/fig/2_ht_fig2_zscore.pdf")))
    print(ht_fig2)
    dev.off()
}
# MEM results over all samples
#' ------------------------
col_fun = colorRamp2(c(-10, 0, 10), c("#66a6cc", "#fff8de", "#cc2010"))

clean_mem_res_all_df <- mem_res_all_df %>%
    mutate(pop = sub(".*TRBV", "TRBV", pop)) %>%
    group_by(pop) %>%
    summarise(across(everything(), mean, na.rm = TRUE)) %>%
    ungroup()
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(everything(), mean, na.rm = TRUE)`.
## ℹ In group 1: `pop = "TRBV10-3"`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
row_ha = rowAnnotation(
  df = clean_mem_res_all_df %>%
      select(-all_of(nadav_panel_marker$name)) %>%
      select(-starts_with("n_")) %>%
      as.data.frame()
)

hm_dat <- clean_mem_res_all_df %>%
    select(all_of(markers_to_plot)) %>% 
    # select_if(function(col) length(unique(col)) > 1) %>%
    as.matrix()

    
ht_fig1 = Heatmap(
  hm_dat, 
  name = "ht_fig1",
  column_title = "",
  cluster_rows = FALSE,
  col = col_fun,
  column_split = marker_categories,
  cluster_columns = FALSE,
  row_title_gp  = gpar(fontsize = 8),
  column_title_gp  = gpar(fontsize = 8),
  left_annotation  = row_ha,
  right_annotation = rowAnnotation(
      `# Vbeta` = anno_barplot(
          clean_mem_res_all_df %>%
              select(n_vbeta) %>%
              as.matrix(),
          add_numbers = TRUE,
          beside = TRUE, attach = TRUE,
          width = unit(2, "cm")
      )
  ),
  heatmap_legend_param = list(
    title = "MEM Score"
    )
  )

ht_fig1

export_figs <- TRUE
if (export_figs) {
    pdf.options(width=12, height=7)
    pdf(file=file.path(data_folder,paste0("data/fig/3_ht_fig1_all.pdf")))
    print(ht_fig1)
    dev.off()
}
## png 
##   2
export_figs <- FALSE

3.2 Vbeta vs Rest - Median Exprs

all_cells_df_comb$dpt <- rep(1000, nrow(all_cells_df_comb))

#' Compute median expression values for all vbeta / rest combinations
#' ---------------

all_pats=TRUE

if (!all_pats) {
    vbeta_pops <- nadav_tcrvb_filt %>%
        # filter(sample_id %in% pat_id_val) %>%
        filter(fcs_name %in% pat_id_val) %>%
        pull(name)
    
} else {
    vbeta_pops <- unique(nadav_tcrvb_filt$name)
}

vbeta_pop_paths <- colnames(all_cells_df_comb)[
    grepl(paste0(paste0(vbeta_pops, "$"), collapse="|"), 
          colnames(all_cells_df_comb)) &
        grepl("CD3\\+\\/non-NK", 
              colnames(all_cells_df_comb))]

# Subset 
cd3_cell_ind <- all_cells_df_comb %>%
    # filter patients with a v beta available
    {if(all_pats) . else filter(., fcs_name %in% pat_id_val)} %>%
    # filter CD3+56- only
    filter(`/CD3+/non-NK T cells`) %>%
    # filter samples only %>%
    filter(sample_type %in% c("Samples", "Patients")) %>% 
    # get dpt from sample file 
    select(-dpt) %>%
    left_join(
        nadav_tcrvb_filt %>%
            {if(all_pats) . else filter(., fcs_name %in% pat_id_val)} %>%
            distinct(fcs_name, dpt, batch),
        by="fcs_name"
    ) %>%
    # select_if(function(col) sum(is.na(col)) == 0 ) %>%
    select(any_of(vbeta_pop_paths), mg_cell_path, cell_index, fcs_name,sample_type, dpt, pat_id) %>%
    left_join(
        colData(sce) %>% 
            as.data.frame() %>%
            bind_rows(
                colData(sce_mha) %>% 
                    as.data.frame()
            ) %>%
            mutate(sce_ind = 1:n()) %>%
            select(#sample_id, #dpt, 
                file_name, cell_index, sce_ind) ,
        by=c(#"pat_id"="sample_id",
            # "dpt"="dpt",
            "fcs_name"="file_name",
            "cell_index"="cell_index")
    ) %>%
    select_if(function(col) any(!is.na(col)))

trbv_cols <- cd3_cell_ind %>%
    select(any_of(vbeta_pop_paths)) %>%
    colnames()

pat_cell_df <- cd3_cell_ind %>%
    bind_cols(
        t(assay(sce, "exprs")) %>%
            as.data.frame() %>%
            select(all_of(nadav_panel_marker$`FCS name`)) %>%
            `colnames<-`(nadav_panel_marker$name) %>%
            bind_rows(
                t(assay(sce_mha, "exprs")) %>%
                    as.data.frame() %>%
                    select(all_of(nadav_panel_marker_mha$`FCS name`)) %>%
                    `colnames<-`(nadav_panel_marker_mha$name)
            ) %>%
            dplyr::slice(cd3_cell_ind$sce_ind)
    ) %>% 
    pivot_longer(cols=all_of(trbv_cols),
                 names_to="pop",
                 values_to="value") %>%
    filter(!is.na(value)) %>%
    mutate(value = ifelse(value, "vbeta", "rest")) %>%
    select(-c(mg_cell_path, sample_type)) 

pop_comp_df <- pat_cell_df %>% 
    distinct(pop, fcs_name, dpt, pat_id)  %>%
    mutate(vbeta_name = sub(".*/", "", pop)) %>%
    # Filter samples with NA dpt, since they don't have a vbeta listed on Nadavs sheet
    filter(!is.na(dpt)) %>%
    # Need to filter only vbeta patient combinations
    filter(
        grepl(
            #legal vbeta name combinations
            nadav_tcrvb_filt %>% 
                mutate(match_str = paste(name,sample_id,dpt,sep="_")) %>% 
                pull(match_str) %>% 
                paste(., collapse="|"),
            paste(pop,pat_id,dpt,sep="_")
        )
    )

pat_n_cells_df <- pat_cell_df %>% 
    group_by(fcs_name, dpt, pat_id, pop) %>% 
    summarise(n_cells=n(), 
              n_vbeta = sum(value == "vbeta"),
              .groups='drop')

num_markers = length(nadav_panel_marker$name)
num_pops = nrow(pop_comp_df)

MAGpop = matrix(nrow=num_pops,ncol=num_markers)
MAGref = matrix(nrow=num_pops,ncol=num_markers)
IQRpop = matrix(nrow=num_pops,ncol=num_markers)
IQRref = matrix(nrow=num_pops,ncol=num_markers)

for (i in seq_len(nrow(pop_comp_df))) {
    filt_dat <- pat_cell_df %>% 
        filter(fcs_name == pop_comp_df$fcs_name[i] & 
                   dpt  == pop_comp_df$dpt[i] &
                   pat_id == pop_comp_df$pat_id[i] & 
                   pop == pop_comp_df$pop[i])

    MAGpop[i,] = filt_dat %>% 
        filter(value == "vbeta") %>% 
        select(all_of(nadav_panel_marker$name)) %>% 
        summarise_all(function(col) median(col, na.rm=TRUE)) %>%
        t() %>% t()
    
    IQRpop[i,] = filt_dat %>% 
        filter(value == "vbeta") %>% 
        select(all_of(nadav_panel_marker$name)) %>% 
        summarise_all(function(col) IQR(col, na.rm=TRUE)) %>%
        t() %>% t()
    
    MAGref[i,] = filt_dat %>% 
        filter(value == "rest") %>% 
        select(all_of(nadav_panel_marker$name)) %>% 
        summarise_all(function(col) median(col, na.rm=TRUE)) %>%
        t() %>% t()
    IQRref[i,] = filt_dat %>% 
        filter(value == "rest") %>% 
        select(all_of(nadav_panel_marker$name)) %>% 
        summarise_all(function(col) IQR(col, na.rm=TRUE)) %>%
        t() %>% t()
}

# Remove the line where there is NA, probably due to not filtering the 
# vbetas for the particular days post
remove_ind = which(is.na(rowSums(IQRpop)))
if (!identical(remove_ind, integer(0))) {
    pop_comp_df <- pop_comp_df %>% slice(-remove_ind)
    MAGpop=MAGpop[-remove_ind,]
    IQRpop=IQRpop[-remove_ind,]
    MAGref=MAGref[-remove_ind,]
    IQRref=IQRref[-remove_ind,]
}


colnames(MAGref) <- nadav_panel_marker$name
colnames(MAGpop) <- nadav_panel_marker$name

pop_median_df <- pop_comp_df %>%
    bind_cols(MAGpop) %>%
    mutate(cell_type = "vbeta") %>%
    bind_rows(
        pop_comp_df %>%
            bind_cols(MAGref) %>%
            mutate(cell_type = "rest")
    ) %>%
    left_join(
        .,
        pat_n_cells_df,
        by=c("fcs_name"="fcs_name",
             "dpt"="dpt",
             "pat_id"="pat_id",
             "pop"="pop")
    )


saveRDS(pop_median_df,
        file=file.path(data_folder, "data/pop_median_df_CD3+56-.rds"))

3.2.0.1 Paired t-test

## tooltip css for interactive plot
tooltip_css <- "border-style: solid; border-color: #c3c3c3; border-radius: 8px; border-width: 0.5px; padding: 5px; box-shadow: 2px 2px 3px 0px #bbbbbb; background-color: white; font: menu;"

pop_median_df <- readRDS(file.path(data_folder, "data/pop_median_df_CD3+56-.rds")) %>%
    # remove duplicate for CT_023
    filter(!(
        pop == "/CD3+/non-NK T cells/FITC-/TRBV25-1" &
            pat_id == "CT_023"
    ))


pop_median_df <- pop_median_df %>%
  filter(!(
        pop == "/CD3+/non-NK T cells/FITC-/TRBV25-1" &
            pat_id == "CT_023"
    ))

# Convert df to long format for graphing
pop_med_vbeta_rest_df <- pop_median_df %>%
    left_join(
        nadav_tcrvb_filt %>%
            distinct(fcs_name, `Patient ID`),
        by=c("fcs_name")
    ) %>%
    mutate(prop_vbeta = n_vbeta/(n_cells),
           ratio_vbeta = n_vbeta/(n_cells-n_vbeta)) %>%
    pivot_longer(cols=nadav_panel_marker$name,
                 names_to="marker",
                 values_to="exprs") %>%
    pivot_wider(names_from="cell_type",
                values_from="exprs")  %>%
    # Exclude markers
    filter(!(marker %in% c("CD11c", "CD14", "CD16", "CD25",
                           "CD66b", "CD56", "CMV_tetramer", "IgD", 
                           "CD19", "CD3", "CD86", "TCR Va7.2"))) %>% 
    filter(dpt > 0) 

pop_med_vbeta_rest_df %>% 
    select(-c("prop_vbeta","ratio_vbeta")) %>%
    mutate_if(is.numeric, function(x) round(x,2)) %>%
    create_dt()
# T-test for each vbeta & dpt population
pop_med_vbeta_rest_ttest <- pop_med_vbeta_rest_df %>%
    group_by(marker) %>%
    group_modify( ~ {
        t.test(.$vbeta,
               .$rest,
               alternative = "two.sided",
               paired = T) %>%
            tidy()
    }) %>%
    ungroup() %>%
    mutate(p_adj = p.adjust(p.value, method="fdr")) %>%
    select(marker, estimate, p.value, p_adj)

pop_med_vbeta_rest_df <- pop_med_vbeta_rest_df %>% filter(as.numeric(dpt) < 270)

p_med_exprs <- pop_med_vbeta_rest_df %>%
    # remove uninformative variables
    filter(!(marker %in% c("CD56"))) %>%
    # filter(as.numeric(dpt) < 270) %>%
    ggplot()+
    aes(x=vbeta,y=rest,
        fill=pmin(180, as.numeric(dpt)),
        data_id=pat_id,
        tooltip=sprintf(
            "<b>Pat ID</b>: %s \n  <b>DPT</b>: %s \n <b>Median Expr (vbeta)</b>: %s \n <b>Median Expr (rest)</b>: %s \n pop: %s",
            pat_id, dpt, signif(vbeta,3), signif(rest,3), pop))+
    geom_text(data=pop_med_vbeta_rest_ttest,
              aes(label=sprintf("p=%s", signif(p_adj,3))),
              inherit.aes = FALSE,
              x = -Inf, y = Inf, hjust = -0.1, vjust = 1.4,
              size=2.5)+
    geom_point_interactive(alpha=0.75,
                           colour="gray60",pch=21)+
    geom_abline(slope=1,linetype="dotted")+
    facet_wrap(~marker, scales="free")+
    scale_fill_gradient2(midpoint=90, 
                         low="yellow", 
                         mid="#ff006e",
                         high="#3a86ff", 
                         breaks=c(30, 90, 180))+
    # scale_size(range=c(1.5,4))+
    theme_bw()+
    labs(title="Median Expression of each non vbeta CD3+ vs vbeta")

ggiraph::girafe(
    ggobj=p_med_exprs,
    height_svg = 12,
    width_svg = 14,
    options = list(
        opts_hover(css = "stroke-width:2"),
        opts_tooltip(css = tooltip_css)
    )
)
if (export_figs) {
    pdf.options(width=14, height=12)
    pdf(file=file.path(data_folder,paste0("data/fig/4_p_med_exprs.pdf")))
    print(p_med_exprs)
    dev.off()
}
library(ggpubr)
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:janitor':
## 
##     make_clean_names
## The following object is masked from 'package:flowCore':
## 
##     filter
## The following object is masked from 'package:IRanges':
## 
##     desc
## The following object is masked from 'package:stats':
## 
##     filter
library(latex2exp)

sigFunc = function(x){
  if(x < 0.001){"***"} 
  else if(x < 0.01){"**"}
  else if(x < 0.05){"*"}
    else if (x<0.1){"°"}
  else{NA}
}


cbp1 <- c("#e69f00", "#56b4e9", "#009e73", "#f0e442",
          "#cc79a7", "#0072b2", "#d55e00", "#999999",
          "#f27c7c", "#279a98", "#9a3f44", "#a53093",
          "#4c5e66")

stat_test_res <- pop_med_vbeta_rest_df %>%
    # filter(dpt <= 90) %>%
    pivot_longer(
        cols = c("vbeta", "rest"),
        names_to = "cell_group",
        values_to = "median_expr"
    ) %>%
    arrange(pat_id, dpt, pop) %>%
    group_by(marker) %>%
    t_test(median_expr ~ cell_group, paired = TRUE) %>%
    arrange(marker) %>% 
    adjust_pvalue(method = "fdr") %>%
    add_significance("p.adj") %>%
    add_xy_position(x = "marker", fun = "max") %>%
    rowwise() %>%
    mutate(signif = sigFunc(p.adj)) %>%
    ungroup()
    
stat_test_res %>% 
    select(marker, statistic, p, p.adj, signif) %>%
    mutate_if(is.numeric, function(x) signif(x, 3)) %>%
    create_dt()
boxplot <- ggboxplot(
  pop_med_vbeta_rest_df %>%
    pivot_longer(
        cols = c("vbeta", "rest"),
        names_to = "cell_group",
        values_to = "median_expr"
    ) %>%
      mutate(marker=factor(marker, levels=stat_test_res$marker)), 
  x = "marker", y = "median_expr", 
  color = "cell_group", 
  fill = "cell_group",
  alpha=0.3,
  palette = c("#00AFBB", "#E7B800")
  ) + 
    stat_pvalue_manual(stat_test_res,  label = "signif", tip.length = 0) +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    labs(title="Comparison of median expression of each vbeta/dpt",
         subtitle="Paired t-test with fdr adjustment")
boxplot
## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`geom_bracket()`).

if (export_figs) {
    pdf.options(width=12, height=6.5)
    pdf(file=file.path(data_folder,paste0("data/fig/5_boxplot.pdf")))
    print(boxplot)
    dev.off()
}

3.2.0.2 Read in metadata

nadav_metadata <- read_xlsx(file.path(data_folder, "data/Nadav_Panel_2024_v2.xlsx"),
                            sheet="metadata_Patient") %>%
    select(-contains("...")) %>%
    rename("pat_id" = "patient_id") %>%
    janitor::clean_names()
## New names:
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...27`
vars <- c("malignant", "pt_cy", "tc_rab_cd19_dep", "serostatus_positive", "cmv_reactivation_any_0_1", "cmv_disease_0_1", "viral_2", "alive_0_dead_1_at_365_dpt", "stem_cell_source", "adoptive_cell_therapy", "sex")

nadav_metadata[vars] <- lapply(nadav_metadata[vars], factor)

pop_med_vbeta_all_df <- pop_med_vbeta_rest_df |>
  left_join(nadav_metadata, by = "pat_id")

3.2.0.3 Fit Linear Regression models

pop_med_vbeta_all_df <- pop_med_vbeta_all_df %>%
    # rename("deceased" = "alive_0_dead_1_at_365_dpt") %>%
    mutate(pat_id_old = pat_id) %>%
    mutate(pat_id = sprintf("%s-%s", `Patient ID`,
                            pat_id)) %>%
    arrange(`Patient ID`) %>%
    mutate(pat_id = factor(pat_id,
                           levels= unique(pat_id)))

#' Perform regression over each marker 
#' -------------
vbeta_med_all_mixregr_p <- pop_med_vbeta_all_df %>%
    group_by(marker) %>%  
    group_modify( ~ {
        m1 <- lme(
            vbeta ~ malignant + pt_cy + tc_rab_cd19_dep + serostatus_positive + cmv_reactivation_any_0_1 + cmv_disease_0_1 + viral_2 + alive_0_dead_1_at_365_dpt + stem_cell_source + adoptive_cell_therapy + sex,
            random =  ~ 1 | pat_id,
            data = data.frame(
                vbeta = .$vbeta,
                pat_id = .$pat_id,
                .[vars]
            ))
        
        anova(m1) %>%
            as.data.frame() %>%
            mutate(var=rownames(.))
    }) %>%
    ungroup()  %>%
    filter(var != "(Intercept)") %>%
    mutate(p_value = p.adjust(`p-value`, "fdr")) %>%
    rename(p_value_unadjusted = "p-value") %>%
    arrange(p_value)

vbeta_med_all_mixregr_p %>%
  create_dt()
pop_med_vbeta_rest_dpt <- pop_med_vbeta_rest_df %>%
    mutate(vbeta_stage = ifelse(dpt < 30, "early", "late") %>%
               as.factor()) %>%
    mutate(deceased = ifelse(pat_id %in% c("MHA_002", "SCH_007", "SCH_002", "LCH_003"),
                             TRUE,
                             FALSE)) %>%
    mutate(pat_id_old = pat_id) %>%
    mutate(pat_id = sprintf("%s-%s", `Patient ID`,
                            pat_id)) %>%
    arrange(`Patient ID`) %>%
    mutate(pat_id = factor(pat_id,
                           levels= unique(pat_id)))
    # filter(n_vbeta > 200)

# pop_med_vbeta_rest_dpt <- pop_med_vbeta_all_df %>%
#     mutate(vbeta_stage = ifelse(dpt < 30, "early", "late") %>%
#                as.factor()) %>%
#     rename("deceased" = "alive_dead_any_time") %>%
#     mutate(pat_id_old = pat_id) %>%
#     mutate(pat_id = sprintf("%s-%s", `Patient ID`,
#                             pat_id)) %>%
#     arrange(`Patient ID`) %>%
#     mutate(pat_id = factor(pat_id,
#                            levels= unique(pat_id)))
#  + deceased + malignant + sex + study + stem_cell_source + adoptive_cell_therapy + viral_2 + serostatus_positive
#' Perform regression over each marker 
#' -------------
pop_med_vbeta_rest_dpt$dpt <- as.numeric(pop_med_vbeta_rest_dpt$dpt)
vbeta_med_dpt_regr_p <- pop_med_vbeta_rest_dpt %>%
    group_by(marker) %>%  
    group_modify( ~ {
        lm(vbeta ~ dpt,
           data=data.frame(vbeta=.$vbeta, dpt=.$dpt)) %>%
            tidy()
    }) %>%
    ungroup() %>%
    filter(term!="(Intercept)") %>%
    mutate(p_value = p.adjust(`p.value`, "fdr")) %>%
    rename(p_value_unadjusted = "p.value") %>%
    arrange(p_value)

vbeta_med_dpt_regr_p %>%
    create_dt()

3.2.0.4 Linear Mixed Model

#' Perform mixed effects regression over each marker 
#' -------------
library(nlme)

vbeta_med_dpt_mixregr_p <- pop_med_vbeta_rest_dpt %>%
    group_by(marker) %>%  
    group_modify( ~ {
        m1 <- lme(
            vbeta ~ dpt,
            random =  ~ 1 | pat_id,
            data = data.frame(
                vbeta = .$vbeta,
                dpt = .$dpt,
                pat_id = .$pat_id
            ))
        
        anova(m1) %>%
            as.data.frame() %>%
            mutate(var=rownames(.))
    }) %>%
    ungroup()  %>%
    filter(var == "dpt") %>%
    mutate(p_value = p.adjust(`p-value`, "fdr")) %>%
    rename(p_value_unadjusted = "p-value") %>%
    arrange(p_value)


vbeta_med_dpt_mixregr_p %>%
    create_dt()
lmer_list <- lapply(
    pop_med_vbeta_rest_dpt %>% distinct(marker) %>% pull(),
    function(marker_val) {
        m1 <- lme(
            vbeta ~ dpt,
            random =  ~ 1 | pat_id,
            data = pop_med_vbeta_rest_dpt %>%
                filter(marker == marker_val))
        return(m1)
    }
) %>%
    `names<-`(pop_med_vbeta_rest_dpt %>% 
                  distinct(marker) %>% 
                  pull())

#' Get coefficients for each patient
# lmer_df <- lapply(names(lmer_list),
#                   function(marker_val) {
#                       coef(lmer_list[[marker_val]]) %>%
#                           rename(intercept = `(Intercept)`, slope = dpt) %>%
#                           tibble::rownames_to_column("group") %>%
#                           mutate(marker = marker_val)
#                   }) %>%
#     bind_rows()

lmer_df <- lapply(names(lmer_list),
                  function(marker_val) {
                      lmer_list[[marker_val]]$coefficients$fixed %>%  
                          as.data.frame() %>% 
                          t() %>% 
                          as.data.frame() %>% 
                          rename(intercept = `(Intercept)`, slope = dpt) %>%
                          mutate(marker = marker_val)
                  }) %>%
    bind_rows()
#' Graph Median expression of vbeta over dpt
#' ---------
p_med_exprs_dpt_col <- pop_med_vbeta_rest_dpt %>%
    ggplot() +
    # geom_abline(
    #     aes(intercept=intercept,
    #         slope=slope),
    #     data=lmer_df,
    #     linewidth=.5,
    #     alpha=.4
    # ) +
    geom_smooth(
        aes(
            x = dpt,
            y = vbeta,
            col = as.factor(pat_id),
            fill = as.factor(pat_id)
        ),
        method = "lm",
        formula = 'y ~ x',
        linewidth = 0.4,
        alpha = 0.3,
        se=FALSE
    ) +
    geom_point_interactive(aes(
        x = dpt,
        y = vbeta,
        data_id = pat_id,
        tooltip = sprintf(
            "<b>Pat ID</b>: %s \n  <b>DPT</b>: %s \n <b>Median Expr (vbeta)</b>: %s \n <b>vbeta</b>: %s \n <b># cells</b>: %s \n <b># vbeta</b>: %s",
            pat_id,
            dpt,
            signif(vbeta, 3),
            pop,
            n_cells,
            n_vbeta
        ),
        col = as.factor(pat_id)
    ),
    alpha = 0.75) +
    geom_text(aes(x=Inf, y=Inf, 
                  label=TeX(sprintf("$p_{}: %s$",signif(p_value,3)), output = "character")),
              vjust=1.3, hjust=1.01,
              parse = TRUE,
              data=vbeta_med_dpt_mixregr_p %>%
                  left_join(vbeta_med_dpt_regr_p %>%
                                rename(p_value_lr = p_value) %>%
                                select(p_value_lr, marker),
                            by="marker"),
              size=3) +
    facet_wrap(~ marker, scales = "free") +
    scale_color_manual(
        values = as.vector(c(pals::alphabet(), pals::alphabet2())),
        breaks = pop_med_vbeta_rest_dpt %>% distinct(pat_id) %>% pull()
    ) +
    scale_fill_manual(
        values = as.vector(c(pals::alphabet(), pals::alphabet2())),
        breaks = pop_med_vbeta_rest_dpt %>% distinct(pat_id) %>% pull(),
        guide = "none"
    ) +
    theme_bw() +
    labs(title = "", 
         subtitle="",
         color="Patient ID") 

p_med_exprs_dpt_col

p_med_exprs_dpt_bw <- pop_med_vbeta_rest_dpt %>%
    ggplot() +
    geom_abline(
        aes(intercept=intercept,
            slope=slope),
        data=lmer_df,
        linewidth=.5,
        alpha=.4
    ) +
    # geom_smooth(
    #     aes(
    #         x = dpt,
    #         y = vbeta
    #     ),
    #     method = "lm",
    #     formula = 'y ~ x',
    #     linewidth = 0.4,
    #     alpha = 0.3,
    #     se=FALSE,
    #     colour="grey50"
    # ) +
    geom_point_interactive(aes(
        x = dpt,
        y = vbeta,
        data_id = pat_id,
        tooltip = sprintf(
            "<b>Pat ID</b>: %s \n  <b>DPT</b>: %s \n <b>Median Expr (vbeta)</b>: %s \n <b>vbeta</b>: %s \n <b># cells</b>: %s \n <b># vbeta</b>: %s",
            pat_id,
            dpt,
            signif(vbeta, 3),
            pop,
            n_cells,
            n_vbeta
        )
    ),
    alpha = 0.75) +
    geom_text(aes(x=Inf, y=Inf, 
                  label=TeX(sprintf("$p_{}: %s$ ",signif(p_value,3)), output = "character")),
              vjust=1.3, hjust=1.01,
              parse = TRUE,
              data=vbeta_med_dpt_mixregr_p %>%
                  left_join(vbeta_med_dpt_regr_p %>%
                                rename(p_value_lr = p_value) %>%
                                select(p_value_lr, marker),
                            by="marker"),
              size=3) +
    facet_wrap(~ marker, scales = "free") +
    theme_bw() +
    labs(title = "", 
         color="Patient ID") 

p_med_exprs_dpt_bw

p_med_exprs_dpt_deceased <- pop_med_vbeta_rest_dpt %>%
    ggplot() +
    geom_abline(
        aes(intercept=intercept,
            slope=slope),
        data=lmer_df,
        linewidth=.5,
        alpha=.4
    ) +
    # geom_smooth(
    #     aes(
    #         x = dpt,
    #         y = vbeta
    #     ),
    #     method = "lm",
    #     formula = 'y ~ x',
    #     linewidth = 0.4,
    #     alpha = 0.3,
    #     se=FALSE,
    #     colour="grey50"
    # ) +
    geom_point_interactive(aes(
        x = dpt,
        y = vbeta,
        data_id = pat_id,
        tooltip = sprintf(
            "<b>Pat ID</b>: %s \n  <b>DPT</b>: %s \n <b>Median Expr (vbeta)</b>: %s \n <b>vbeta</b>: %s \n <b># cells</b>: %s \n <b># vbeta</b>: %s",
            pat_id,
            dpt,
            signif(vbeta, 3),
            pop,
            n_cells,
            n_vbeta
        ),
        shape=deceased,
        col=deceased
    ),
    alpha = 0.75) +
    geom_text(aes(x=Inf, y=Inf, 
                  label=TeX(sprintf("$p_{}: %s$ ",signif(p_value,3)), output = "character")),
              vjust=1.3, hjust=1.01,
              parse = TRUE,
              data=vbeta_med_dpt_mixregr_p %>%
                  left_join(vbeta_med_dpt_regr_p %>%
                                rename(p_value_lr = p_value) %>%
                                select(p_value_lr, marker),
                            by="marker"),
              size=3) +
    facet_wrap(~ marker, scales = "free") +
    scale_color_manual(values=cbp1)+
    theme_bw() +
    labs(title = "") 

p_med_exprs_dpt_deceased

ggiraph::girafe(
    ggobj=p_med_exprs_dpt_col,
    height_svg = 12,
    width_svg = 14,
    options = list(
        opts_hover(css = "stroke-width:2"),
        opts_tooltip(css = tooltip_css)
    )
)
# Export pdfs
export_figs <- FALSE

if (export_figs) {
    pdf.options(width=12, height=12)
    pdf(file=file.path(data_folder,paste0("data/fig/med_exprs_dpt_color.pdf")))
    print(p_med_exprs_dpt_col)
    dev.off()
    
    pdf.options(width=12, height=12)
    pdf(file=file.path(data_folder,paste0("data/fig/med_exprs_dpt_bw.pdf")))
    print(p_med_exprs_dpt_bw)
    dev.off()
    
    pdf.options(width=12, height=12)
    pdf(file=file.path(data_folder,paste0("data/fig/med_exprs_dpt_deceased.pdf")))
    print(p_med_exprs_dpt_deceased)
    dev.off()
    
    pdf.options(width=12, height=12)
    pdf(file=file.path(data_folder,paste0("data/fig/med_exprs_vbeta_v_rest.pdf")))
    print(p_med_exprs)
    dev.off()
}